{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Illiberal Communication and Election Intervention During the Refugee Crisis in Germany\n",
    "\n",
    "Ashrakat Elshehawy, Konstantin Gavras, Nikolay Marinov, Federico Nanni, Harald Schoen\n",
    "\n",
    "Perspectives on Politics\n",
    "\n",
    "Dataverse link:\n",
    "\n",
    "\"Replication Data for: Illiberal Communication and Election Intervention During the Refugee Crisis in Germany\", https://doi.org/10.7910/DVN/T2FZK3, Harvard Dataverse, DRAFT VERSION, UNF:6:L4g980UvlhsPseyzqCxmKw== [fileUNF]\n",
    "\n",
    "Script to get overall score for each of the topics for each newspaper article, code implemented by Ashrakat Elshehawy and Federico Nanni"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: Please change the path to all files to your local path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ashrakatelshehawy/opt/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
      "  warnings.warn(msg)\n"
     ]
    }
   ],
   "source": [
    "#import libraries\n",
    "import codecs, nltk, string, os, gensim\n",
    "import numpy as np\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Text Preprocessing and Embedding\n",
    "\n",
    "exclude = set(string.punctuation)\n",
    "\n",
    "# this represent any text as a single \"doc-embedding\" we use it both for the query and the sentences\n",
    "# input should be a string\n",
    "def text_embedding(text):\n",
    "    \n",
    "\n",
    "    \n",
    "    # we tokenize the text in single words\n",
    "    text = nltk.tokenize.WordPunctTokenizer().tokenize(text)\n",
    "    \n",
    "    # we remove numbers and punctuation\n",
    "    text = [token for token in text if token not in exclude and token.isalpha()]\n",
    "    \n",
    "    \n",
    "    doc_embed = []\n",
    "    \n",
    "    # for each word we get the embedding and we append it to a list\n",
    "    for word in text:\n",
    "        if word in emb_model:\n",
    "            embed_word = emb_model[word]\n",
    "            doc_embed.append(embed_word)\n",
    "        else:\n",
    "            if word.lower() in emb_model:\n",
    "                embed_word = emb_model[word.lower()]\n",
    "                doc_embed.append(embed_word)\n",
    "            \n",
    "    # we average the embeddings of all the words, getting an overall doc embedding\n",
    "    if len(doc_embed)>0:\n",
    "        avg = [float(sum(col))/len(col) for col in zip(*doc_embed)]\n",
    "\n",
    "        avg = np.array(avg).reshape(1, -1)\n",
    "\n",
    "        # the output is a doc-embedding\n",
    "        return avg\n",
    "    else:\n",
    "        return \"Empty\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Wikipedia Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add the path to the embedding_file \n",
    "# we will run the analysis once with each of these embedding files:\n",
    "# 1) the wikipedia embeddings 2) indomain embeddings\n",
    "\n",
    "\n",
    "#the files can take a bit to load\n",
    "embed_file = '/Users/ashrakatelshehawy/Embeddings/wiki.de.vec'\n",
    "\n",
    "emb_model = gensim.models.KeyedVectors.load_word2vec_format(embed_file, binary=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "verspekulieren\n",
      "Weltlenker\n",
      "Meinungstyrannen\n"
     ]
    }
   ],
   "source": [
    "#add dictionary of keywords\n",
    "topics = open(\"/Users/ashrakatelshehawy/Dictionaries/topics-classification.txt\",\"r\").read().strip().split(\"\\n\")\n",
    "topics = [x.split(\"\\t\") for x in topics if len(x)>1]\n",
    "\n",
    "\n",
    "#embedd dictionary\n",
    "topics_emb = {}\n",
    "\n",
    "for topic in topics:\n",
    "    label = topic[0]\n",
    "    words = topic[1].split(\",\")\n",
    "    words = [x.strip().replace('\"','') for x in words]\n",
    "    topic_emb = text_embedding(\" \".join(words))\n",
    "    topics_emb[label] = topic_emb\n",
    "    for word in words:\n",
    "        if len(text_embedding(word))>1 and len(text_embedding(word.lower()))>1:\n",
    "            print (word)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sueddeutsche_relevant-migrant-news.csv\n",
      "3832\n",
      "0 3832\n",
      "100 3832\n",
      "200 3832\n",
      "300 3832\n",
      "400 3832\n",
      "500 3832\n",
      "600 3832\n",
      "700 3832\n",
      "800 3832\n",
      "900 3832\n",
      "1000 3832\n",
      "1100 3832\n",
      "1200 3832\n",
      "1300 3832\n",
      "1400 3832\n",
      "1500 3832\n",
      "1600 3832\n",
      "1700 3832\n",
      "1800 3832\n",
      "1900 3832\n",
      "2000 3832\n",
      "2100 3832\n",
      "2200 3832\n",
      "2300 3832\n",
      "2400 3832\n",
      "2500 3832\n",
      "2600 3832\n",
      "2700 3832\n",
      "2800 3832\n",
      "2900 3832\n",
      "3000 3832\n",
      "3100 3832\n",
      "3200 3832\n",
      "3300 3832\n",
      "3400 3832\n",
      "3500 3832\n",
      "3600 3832\n",
      "3700 3832\n",
      "3800 3832\n",
      "sputnik_relevant-migrant-news.csv\n",
      "3324\n",
      "0 3324\n",
      "100 3324\n",
      "200 3324\n",
      "300 3324\n",
      "400 3324\n",
      "500 3324\n",
      "600 3324\n",
      "700 3324\n",
      "800 3324\n",
      "900 3324\n",
      "1000 3324\n",
      "1100 3324\n",
      "1200 3324\n",
      "1300 3324\n",
      "1400 3324\n",
      "1500 3324\n",
      "1600 3324\n",
      "1700 3324\n",
      "1800 3324\n",
      "1900 3324\n",
      "2000 3324\n",
      "2100 3324\n",
      "2200 3324\n",
      "2300 3324\n",
      "2400 3324\n",
      "2500 3324\n",
      "2600 3324\n",
      "2700 3324\n",
      "2800 3324\n",
      "2900 3324\n",
      "3000 3324\n",
      "3100 3324\n",
      "3200 3324\n",
      "3300 3324\n",
      "welt_relevant-migrant-news.csv\n",
      "4073\n",
      "0 4073\n",
      "100 4073\n",
      "200 4073\n",
      "300 4073\n",
      "400 4073\n",
      "500 4073\n",
      "600 4073\n",
      "700 4073\n",
      "800 4073\n",
      "900 4073\n",
      "1000 4073\n",
      "1100 4073\n",
      "1200 4073\n",
      "1300 4073\n",
      "1400 4073\n",
      "1500 4073\n",
      "1600 4073\n",
      "1700 4073\n",
      "1800 4073\n",
      "1900 4073\n",
      "2000 4073\n",
      "2100 4073\n",
      "2200 4073\n",
      "2300 4073\n",
      "2400 4073\n",
      "2500 4073\n",
      "2600 4073\n",
      "2700 4073\n",
      "2800 4073\n",
      "2900 4073\n",
      "3000 4073\n",
      "3100 4073\n",
      "3200 4073\n",
      "3300 4073\n",
      "3400 4073\n",
      "3500 4073\n",
      "3600 4073\n",
      "3700 4073\n",
      "3800 4073\n",
      "3900 4073\n",
      "4000 4073\n",
      "bild_relevant-migrant-news.csv\n",
      "917\n",
      "0 917\n",
      "100 917\n",
      "200 917\n",
      "300 917\n",
      "400 917\n",
      "500 917\n",
      "600 917\n",
      "700 917\n",
      "800 917\n",
      "900 917\n",
      "rt_relevant-migrant-news.csv\n",
      "744\n",
      "0 744\n",
      "100 744\n",
      "200 744\n",
      "300 744\n",
      "400 744\n",
      "500 744\n",
      "600 744\n",
      "700 744\n",
      "taz_relevant-migrant-news.csv\n",
      "3361\n",
      "0 3361\n",
      "100 3361\n",
      "200 3361\n",
      "300 3361\n",
      "400 3361\n",
      "500 3361\n",
      "600 3361\n",
      "700 3361\n",
      "800 3361\n",
      "900 3361\n",
      "1000 3361\n",
      "1100 3361\n",
      "1200 3361\n",
      "1300 3361\n",
      "1400 3361\n",
      "1500 3361\n",
      "1600 3361\n",
      "1700 3361\n",
      "1800 3361\n",
      "1900 3361\n",
      "2000 3361\n",
      "2100 3361\n",
      "2200 3361\n",
      "2300 3361\n",
      "2400 3361\n",
      "2500 3361\n",
      "2600 3361\n",
      "2700 3361\n",
      "2800 3361\n",
      "2900 3361\n",
      "3000 3361\n",
      "3100 3361\n",
      "3200 3361\n",
      "3300 3361\n",
      "faz_relevant-migrant-news.csv\n",
      "9199\n",
      "0 9199\n",
      "100 9199\n",
      "200 9199\n",
      "300 9199\n",
      "400 9199\n",
      "500 9199\n",
      "600 9199\n",
      "700 9199\n",
      "800 9199\n",
      "900 9199\n",
      "1000 9199\n",
      "1100 9199\n",
      "1200 9199\n",
      "1300 9199\n",
      "1400 9199\n",
      "1500 9199\n",
      "1600 9199\n",
      "1700 9199\n",
      "1800 9199\n",
      "1900 9199\n",
      "2000 9199\n",
      "2100 9199\n",
      "2200 9199\n",
      "2300 9199\n",
      "2400 9199\n",
      "2500 9199\n",
      "2600 9199\n",
      "2700 9199\n",
      "2800 9199\n",
      "2900 9199\n",
      "3000 9199\n",
      "3100 9199\n",
      "3200 9199\n",
      "3300 9199\n",
      "3400 9199\n",
      "3500 9199\n",
      "3600 9199\n",
      "3700 9199\n",
      "3800 9199\n",
      "3900 9199\n",
      "4000 9199\n",
      "4100 9199\n",
      "4200 9199\n",
      "4300 9199\n",
      "4400 9199\n",
      "4500 9199\n",
      "4600 9199\n",
      "4700 9199\n",
      "4800 9199\n",
      "4900 9199\n",
      "5000 9199\n",
      "5100 9199\n",
      "5200 9199\n",
      "5300 9199\n",
      "5400 9199\n",
      "5500 9199\n",
      "5600 9199\n",
      "5700 9199\n",
      "5800 9199\n",
      "5900 9199\n",
      "6000 9199\n",
      "6100 9199\n",
      "6200 9199\n",
      "6300 9199\n",
      "6400 9199\n",
      "6500 9199\n",
      "6600 9199\n",
      "6700 9199\n",
      "6800 9199\n",
      "6900 9199\n",
      "7000 9199\n",
      "7100 9199\n",
      "7200 9199\n",
      "7300 9199\n",
      "7400 9199\n",
      "7500 9199\n",
      "7600 9199\n",
      "7700 9199\n",
      "7800 9199\n",
      "7900 9199\n",
      "8000 9199\n",
      "8100 9199\n",
      "8200 9199\n",
      "8300 9199\n",
      "8400 9199\n",
      "8500 9199\n",
      "8600 9199\n",
      "8700 9199\n",
      "8800 9199\n",
      "8900 9199\n",
      "9000 9199\n",
      "9100 9199\n"
     ]
    }
   ],
   "source": [
    "\n",
    "path1 = \"/Users/ashrakatelshehawy/Refugee Relevant Media pieces/\"\n",
    "\n",
    "\n",
    "out = open(\"refugee-wikiemb-class-score.csv\",\"w\") #file output\n",
    "out.write(\"url,n_words,FinanceEconomy,DegradationCrime,PopulismConspiracyColRevolt\\n\")\n",
    "\n",
    "for filename in os.listdir(path1):\n",
    "    if \".csv\" in filename:\n",
    "        print (filename)\n",
    "        newspaper = open(path1+filename,\"r\").read().strip().split(\"\\n\")\n",
    "        print (len(newspaper))\n",
    "        for j in range(len(newspaper)):\n",
    "            article = newspaper[j]\n",
    "            article = article.split(\"\\t\")\n",
    "            url = article[0]\n",
    "            title = article[3]\n",
    "            content = article[4]\n",
    "            title_cont = title + \" \"+ content\n",
    "            n_words = str(len(title_cont.split(\" \")))\n",
    "            emb = text_embedding(title_cont)\n",
    "            scores = [url,n_words]\n",
    "            for topic,t_emb in topics_emb.items():\n",
    "                cs = cosine_similarity(emb, t_emb)[0][0]\n",
    "                scores.append(str(cs))\n",
    "            scores = \",\".join(scores)\n",
    "            out.write(scores+\"\\n\")\n",
    "            if j%100 == 0:\n",
    "                print (j,len(newspaper))\n",
    "out.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In-Domain Embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# add the path to the embedding_file \n",
    "#the files can take a bit to load\n",
    "embed_file = '/Users/ashrakatelshehawy/Embeddings/in-domain-embeddings.txt'\n",
    "\n",
    "emb_model = gensim.models.KeyedVectors.load_word2vec_format(embed_file, binary=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Steuernachzahlung\n",
      "Steuerbescheid\n",
      "Fondsvermögen\n",
      "steuerbefreit\n",
      "Bitcoin\n",
      "Zahlungssystem\n",
      "Kursverlust\n",
      "Philanthrop\n",
      "eklig\n",
      "unmoral\n",
      "Seitensprung\n",
      "ausbeult\n",
      "Vielehe\n",
      "Weltlenker\n",
      "Verschwörung\n",
      "Geheimdokumente\n",
      "Meinungstyrannen\n",
      "Soros\n",
      "Euromaidan\n",
      "<class 'list'>\n"
     ]
    }
   ],
   "source": [
    "#add dictionary\n",
    "topics = open(\"/Users/ashrakatelshehawy/Dictionaries/topics-classification.txt\",\"r\").read().strip().split(\"\\n\")\n",
    "topics = [x.split(\"\\t\") for x in topics if len(x)>1]\n",
    "#embedd dictionary\n",
    "topics_emb = {}\n",
    "\n",
    "for topic in topics:\n",
    "    label = topic[0]\n",
    "    words = topic[1].split(\",\")\n",
    "    words = [x.strip().replace('\"','') for x in words]\n",
    "    topic_emb = text_embedding(\" \".join(words))\n",
    "    topics_emb[label] = topic_emb\n",
    "    for word in words:\n",
    "        if len(text_embedding(word))>1 and len(text_embedding(word.lower()))>1:\n",
    "            print (word)\n",
    "\n",
    "\n",
    "\n",
    "topics\n",
    "\n",
    "print(type(topics))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sueddeutsche_relevant-migrant-news.csv\n",
      "3832\n",
      "0 3832\n",
      "100 3832\n",
      "200 3832\n",
      "300 3832\n",
      "400 3832\n",
      "500 3832\n",
      "600 3832\n",
      "700 3832\n",
      "800 3832\n",
      "900 3832\n",
      "1000 3832\n",
      "1100 3832\n",
      "1200 3832\n",
      "1300 3832\n",
      "1400 3832\n",
      "1500 3832\n",
      "1600 3832\n",
      "1700 3832\n",
      "1800 3832\n",
      "1900 3832\n",
      "2000 3832\n",
      "2100 3832\n",
      "2200 3832\n",
      "2300 3832\n",
      "2400 3832\n",
      "2500 3832\n",
      "2600 3832\n",
      "2700 3832\n",
      "2800 3832\n",
      "2900 3832\n",
      "3000 3832\n",
      "3100 3832\n",
      "3200 3832\n",
      "3300 3832\n",
      "3400 3832\n",
      "3500 3832\n",
      "3600 3832\n",
      "3700 3832\n",
      "3800 3832\n",
      "sputnik_relevant-migrant-news.csv\n",
      "3324\n",
      "0 3324\n",
      "100 3324\n",
      "200 3324\n",
      "300 3324\n",
      "400 3324\n",
      "500 3324\n",
      "600 3324\n",
      "700 3324\n",
      "800 3324\n",
      "900 3324\n",
      "1000 3324\n",
      "1100 3324\n",
      "1200 3324\n",
      "1300 3324\n",
      "1400 3324\n",
      "1500 3324\n",
      "1600 3324\n",
      "1700 3324\n",
      "1800 3324\n",
      "1900 3324\n",
      "2000 3324\n",
      "2100 3324\n",
      "2200 3324\n",
      "2300 3324\n",
      "2400 3324\n",
      "2500 3324\n",
      "2600 3324\n",
      "2700 3324\n",
      "2800 3324\n",
      "2900 3324\n",
      "3000 3324\n",
      "3100 3324\n",
      "3200 3324\n",
      "3300 3324\n",
      "welt_relevant-migrant-news.csv\n",
      "4073\n",
      "0 4073\n",
      "100 4073\n",
      "200 4073\n",
      "300 4073\n",
      "400 4073\n",
      "500 4073\n",
      "600 4073\n",
      "700 4073\n",
      "800 4073\n",
      "900 4073\n",
      "1000 4073\n",
      "1100 4073\n",
      "1200 4073\n",
      "1300 4073\n",
      "1400 4073\n",
      "1500 4073\n",
      "1600 4073\n",
      "1700 4073\n",
      "1800 4073\n",
      "1900 4073\n",
      "2000 4073\n",
      "2100 4073\n",
      "2200 4073\n",
      "2300 4073\n",
      "2400 4073\n",
      "2500 4073\n",
      "2600 4073\n",
      "2700 4073\n",
      "2800 4073\n",
      "2900 4073\n",
      "3000 4073\n",
      "3100 4073\n",
      "3200 4073\n",
      "3300 4073\n",
      "3400 4073\n",
      "3500 4073\n",
      "3600 4073\n",
      "3700 4073\n",
      "3800 4073\n",
      "3900 4073\n",
      "4000 4073\n",
      "bild_relevant-migrant-news.csv\n",
      "917\n",
      "0 917\n",
      "100 917\n",
      "200 917\n",
      "300 917\n",
      "400 917\n",
      "500 917\n",
      "600 917\n",
      "700 917\n",
      "800 917\n",
      "900 917\n",
      "rt_relevant-migrant-news.csv\n",
      "744\n",
      "0 744\n",
      "100 744\n",
      "200 744\n",
      "300 744\n",
      "400 744\n",
      "500 744\n",
      "600 744\n",
      "700 744\n",
      "taz_relevant-migrant-news.csv\n",
      "3361\n",
      "0 3361\n",
      "100 3361\n",
      "200 3361\n",
      "300 3361\n",
      "400 3361\n",
      "500 3361\n",
      "600 3361\n",
      "700 3361\n",
      "800 3361\n",
      "900 3361\n",
      "1000 3361\n",
      "1100 3361\n",
      "1200 3361\n",
      "1300 3361\n",
      "1400 3361\n",
      "1500 3361\n",
      "1600 3361\n",
      "1700 3361\n",
      "1800 3361\n",
      "1900 3361\n",
      "2000 3361\n",
      "2100 3361\n",
      "2200 3361\n",
      "2300 3361\n",
      "2400 3361\n",
      "2500 3361\n",
      "2600 3361\n",
      "2700 3361\n",
      "2800 3361\n",
      "2900 3361\n",
      "3000 3361\n",
      "3100 3361\n",
      "3200 3361\n",
      "3300 3361\n",
      "faz_relevant-migrant-news.csv\n",
      "9199\n",
      "0 9199\n",
      "100 9199\n",
      "200 9199\n",
      "300 9199\n",
      "400 9199\n",
      "500 9199\n",
      "600 9199\n",
      "700 9199\n",
      "800 9199\n",
      "900 9199\n",
      "1000 9199\n",
      "1100 9199\n",
      "1200 9199\n",
      "1300 9199\n",
      "1400 9199\n",
      "1500 9199\n",
      "1600 9199\n",
      "1700 9199\n",
      "1800 9199\n",
      "1900 9199\n",
      "2000 9199\n",
      "2100 9199\n",
      "2200 9199\n",
      "2300 9199\n",
      "2400 9199\n",
      "2500 9199\n",
      "2600 9199\n",
      "2700 9199\n",
      "2800 9199\n",
      "2900 9199\n",
      "3000 9199\n",
      "3100 9199\n",
      "3200 9199\n",
      "3300 9199\n",
      "3400 9199\n",
      "3500 9199\n",
      "3600 9199\n",
      "3700 9199\n",
      "3800 9199\n",
      "3900 9199\n",
      "4000 9199\n",
      "4100 9199\n",
      "4200 9199\n",
      "4300 9199\n",
      "4400 9199\n",
      "4500 9199\n",
      "4600 9199\n",
      "4700 9199\n",
      "4800 9199\n",
      "4900 9199\n",
      "5000 9199\n",
      "5100 9199\n",
      "5200 9199\n",
      "5300 9199\n",
      "5400 9199\n",
      "5500 9199\n",
      "5600 9199\n",
      "5700 9199\n",
      "5800 9199\n",
      "5900 9199\n",
      "6000 9199\n",
      "6100 9199\n",
      "6200 9199\n",
      "6300 9199\n",
      "6400 9199\n",
      "6500 9199\n",
      "6600 9199\n",
      "6700 9199\n",
      "6800 9199\n",
      "6900 9199\n",
      "7000 9199\n",
      "7100 9199\n",
      "7200 9199\n",
      "7300 9199\n",
      "7400 9199\n",
      "7500 9199\n",
      "7600 9199\n",
      "7700 9199\n",
      "7800 9199\n",
      "7900 9199\n",
      "8000 9199\n",
      "8100 9199\n",
      "8200 9199\n",
      "8300 9199\n",
      "8400 9199\n",
      "8500 9199\n",
      "8600 9199\n",
      "8700 9199\n",
      "8800 9199\n",
      "8900 9199\n",
      "9000 9199\n",
      "9100 9199\n"
     ]
    }
   ],
   "source": [
    "path1 = \"/Users/ashrakatelshehawy/Refugee Relevant Media pieces/\"\n",
    "\n",
    "#change the name of the outcome file according to which embedding has been used\n",
    "out = open(\"refugee-indomain-class-score.csv\",\"w\") #file output\n",
    "out.write(\"url,n_words,FinanceEconomy,DegradationCrime,PopulismConspiracyColRevolt\\n\")\n",
    "\n",
    "for filename in os.listdir(path1):\n",
    "    if \".csv\" in filename:\n",
    "        print (filename)\n",
    "        newspaper = open(path1+filename,\"r\").read().strip().split(\"\\n\")\n",
    "        print (len(newspaper))\n",
    "        for j in range(len(newspaper)):\n",
    "            article = newspaper[j]\n",
    "            article = article.split(\"\\t\")\n",
    "            url = article[0]\n",
    "            title = article[3]\n",
    "            content = article[4]\n",
    "            title_cont = title + \" \"+ content\n",
    "            n_words = str(len(title_cont.split(\" \")))\n",
    "            emb = text_embedding(title_cont)\n",
    "            scores = [url,n_words]\n",
    "            for topic,t_emb in topics_emb.items():\n",
    "                cs = cosine_similarity(emb, t_emb)[0][0]\n",
    "                scores.append(str(cs))\n",
    "            scores = \",\".join(scores)\n",
    "            out.write(scores+\"\\n\")\n",
    "            if j%100 == 0:\n",
    "                print (j,len(newspaper))\n",
    "out.close()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
